hw7

Author

Ryan Klein

Homework 7

Import Packages

import altair as alt
import pandas as pd
from altair import datum
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')

Part 1

# Gasoline pump prices (the file name indicates US$ per liter); the preview
# below shows one row per country and one column per year.
gas_gap_data_url = "https://calvin-data304.netlify.app/data/pump_price_for_gasoline_us_per_liter.csv"
gas_gap_data = pd.read_csv(filepath_or_buffer=gas_gap_data_url)
# Preview the first five rows.
gas_gap_data.head(n=5)
country 1991 1992 1993 1994 1995 1996 1997 1998 1999 ... 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
0 Afghanistan NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 1.05 NaN 1.15 NaN 1.28 NaN 1.07 NaN 0.7
1 Angola NaN NaN NaN NaN NaN NaN NaN 0.38 NaN ... NaN 0.53 NaN 0.65 NaN 0.63 NaN 0.76 NaN 0.97
2 Albania NaN NaN NaN NaN NaN NaN NaN 0.86 NaN ... NaN 1.36 NaN 1.46 NaN 1.81 NaN 1.76 NaN 1.36
3 Andorra NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 1.24 NaN 1.49 NaN 1.67 NaN 1.51 NaN NaN
4 UAE NaN NaN NaN NaN NaN NaN NaN 0.23 NaN ... NaN 0.45 NaN 0.47 NaN 0.47 NaN 0.47 NaN 0.49

5 rows × 27 columns

We account for several label discrepancies between the datasets.

# Row index label -> replacement value for the `country` column.
# NOTE(review): entries are keyed by row label, so they depend on the row
# order of the loaded CSV -- confirm if the source file changes.
country_label_fixes = {
    35: "Dem. Rep. Congo",
    172: "United States of America",
    28: "Central African Rep.",
    58: "United Kingdom",
    152: "S. Sudan",
    85: "Kyrgyzstan",
    19: "Bosnia and Herz.",
    33: "Côte d'Ivoire",
    4: "United Arab Emirates",
    46: "Dominican Rep.",
    108: "Macedonia",
}
for row_label, fixed_name in country_label_fixes.items():
    gas_gap_data.at[row_label, 'country'] = fixed_name
# Country shapes come from the world-atlas TopoJSON file ("countries" feature).
countries = alt.topo_feature(
    'https://cdn.jsdelivr.net/npm/world-atlas@2/countries-110m.json',
    feature='countries',
)

# Base map: grey countries with black borders, Mercator projection.
country_map = (
    alt.Chart(countries)
    .mark_geoshape(fill='#aaaaaa', stroke='#000000')
    .project('mercator')
)

country_map.properties(width=600, height=400)

# Attach the 2012 column to each shape by matching the shape's
# `properties.name` against the `country` column, then colour by that value.
price_lookup = alt.LookupData(gas_gap_data, 'country', ['2012'])
(
    country_map
    .transform_lookup(lookup='properties.name', from_=price_lookup)
    .encode(
        fill="2012:Q",
        tooltip=["properties.name:O", "2012:Q"],
    )
    .properties(width=600, height=400, title="Gas rate: US $ per liter")
)

Part 2

# Survey data (wvs.csv). The charts in this document read its `country`,
# `age`, `age3`, `age6` and `democracy_importance` columns.
democracy_url = "https://calvin-data304.netlify.app/data/wvs.csv"
democracy_data = pd.read_csv(filepath_or_buffer=democracy_url)

Wrangle the total number of respondents for each nation

# Count the rows (respondents) per country, producing a two-column frame:
# `country` and `total`.
# The count column is named explicitly via `reset_index(name=...)` instead of
# renaming column `0`: the default name of the value_counts result differs
# between pandas versions (unnamed before 2.0, "count" from 2.0 on), so a
# rename keyed on `0` silently does nothing on newer pandas and the chart
# would then reference a `total` field that does not exist.
respondants_per_country = (
    democracy_data.value_counts("country").reset_index(name="total")
)

# Bar chart of respondents per country, bars ordered by descending total.
base = alt.Chart(respondants_per_country).encode(
    alt.X(field='country', type="ordinal", sort="-y"),
    alt.Y(field='total', type="quantitative"),
).properties(width=300, height=300, title="Number of Respondants")
base.mark_bar()

Part 3

Age3

# One horizontal box plot of `age` per `age3` group (whiskers span min..max),
# coloured by group.
age3_boxes = (
    alt.Chart(democracy_data)
    .mark_boxplot(extent="min-max")
    .encode(
        x=alt.X("age:Q", title="Age in Years"),
        y=alt.Y("age3:N"),
        color=alt.Color(field="age3", type="nominal"),
    )
    .properties(width=300, height=75)
)

# Repeat the panel for every country, three panels per row.
age3_facet = age3_boxes.facet(facet="country:O", columns=3)
age3_facet

Age6

# One horizontal box plot of `age` per `age6` group (whiskers span min..max),
# coloured by group.
age6_boxes = (
    alt.Chart(democracy_data)
    .mark_boxplot(extent="min-max")
    .encode(
        x=alt.X("age:Q", title="Age in Years"),
        y=alt.Y("age6:N"),
        color=alt.Color(field="age6", type="nominal"),
    )
    .properties(width=300, height=150)
)

# Repeat the panel for every country, three panels per row.
age6_facet = age6_boxes.facet(facet="country:O", columns=3)
age6_facet

Part 4

Skipped. Will come back to later if time allows.

Part 5

# Line through the mean of `democracy_importance` for each `age6` group.
lines = (
    alt.Chart(democracy_data)
    .mark_line()
    .encode(
        x=alt.X("age6:O", sort="-x"),
        y=alt.Y(
            field="democracy_importance",
            aggregate="mean",
            type="quantitative",
        ),
    )
    .properties(width=200, height=400)
)

# Error band over the same groups; this layer carries the axis titles.
bands = (
    alt.Chart(democracy_data)
    .mark_errorband()
    .encode(
        x=alt.X("age6:O", sort="-x", title="Age Grouping"),
        y=alt.Y("democracy_importance:Q", title="Average importance of democracy"),
    )
    .properties(width=200, height=400, title="")
)

# Stack the line and the band, then draw one panel per country.
line_and_band = lines + bands
alt.layer(line_and_band).facet(facet="country:O")

Part 6

# Same construction as the age-group chart, but keyed on the raw `age` column
# (treated as ordinal), so there is one x position per distinct age value.
lines = (
    alt.Chart(democracy_data)
    .mark_line()
    .encode(
        x=alt.X("age:O", sort="-x"),
        y=alt.Y(
            field="democracy_importance",
            aggregate="mean",
            type="quantitative",
        ),
    )
    .properties(width=200, height=400)
)

# Error band over the same x positions; this layer carries the axis titles.
bands = (
    alt.Chart(democracy_data)
    .mark_errorband()
    .encode(
        x=alt.X("age:O", sort="-x", title="Age Grouping"),
        y=alt.Y("democracy_importance:Q", title="Average importance of democracy"),
    )
    .properties(width=200, height=400, title="")
)

# Stack the line and the band, then draw one panel per country.
line_and_band = lines + bands
alt.layer(line_and_band).facet(facet="country:O")

Using “age” instead of “age6” makes the graphic worse, because the statistics are then calculated separately for every individual age present in the dataset. “age6”, on the other hand, is useful because it groups the cases into 6 bins, each defined by an age range. This makes the plot much less chaotic and much more interpretable.

Part 7

# Shared encodings and sizing for the smoothed chart (no mark chosen yet).
loess_chart_base = (
    alt.Chart(democracy_data)
    .encode(
        x=alt.X("age6:O", sort="-x", title="Age Grouping"),
        y=alt.Y("democracy_importance:Q", title="Average importance of democracy"),
    )
    .properties(width=200, height=400, title="")
)

# LOESS-smooth `democracy_importance` against `age6`, draw it as a line, and
# show one panel per country.
# NOTE(review): `age6` is encoded as nominal/ordinal elsewhere in this
# document; LOESS presumably needs a numeric predictor -- confirm the column
# is numeric, otherwise the smoothed line may come out empty.
loess_lines = loess_chart_base.transform_loess(
    on='age6',
    loess='democracy_importance',
).mark_line()
loess_lines.facet(facet="country:O")